Read in dataset

# read RNA expression counts and patient data
load(file="data/readMatrix_PBMC_BulkRNAseq_CVID.Rdata")

if (!file.exists("translated_genes.txt")) {
    source("gene_translation.R")
}


mt.counts.df <- read.csv("translated_genes.txt", row.names = 1)
groups <- sample.info$disease 

groups <- ifelse(groups %in% c("UCVID", "CVID"), 
                "Deficient", 
                as.character(groups)) 

# store index of NCVID subjects
remove_NCVID_index <- which(groups == "NCVID")

# update group label vector
groups <- groups[groups != "NCVID"] 
groups <- factor(groups)

# check modified vector
print(groups) 
##  [1] healthy   healthy   healthy   healthy   healthy   healthy   Deficient
##  [8] Deficient Deficient Deficient Deficient Deficient Deficient Deficient
## [15] Deficient Deficient Deficient Deficient Deficient Deficient Deficient
## [22] Deficient Deficient Deficient Deficient Deficient Deficient Deficient
## [29] Deficient Deficient Deficient Deficient Deficient Deficient healthy  
## [36] healthy   healthy   healthy   healthy   healthy   healthy   healthy  
## Levels: Deficient healthy
#updating dataframes
mt.counts.df <- mt.counts.df[,-remove_NCVID_index]
sample.info <- sample.info[-remove_NCVID_index,]
sample.info$disease <- groups

#  re-store results
counts <- mt.counts.df
meta_data <- sample.info

# isolate group indicator variable
group <- meta_data$disease

# aggregate patient and expression data into a single object
se_cvid <- SummarizedExperiment(assays=list(counts=counts),
                     colData = meta_data)


# isolating treatment group experiments
treatments <- meta_data$treatment
ctrl_index <- which(treatments == "ctrl")
LPS_index <- which(treatments == "LPS")

counts_ctrl <- counts[,-LPS_index]
counts_LPS <- counts[,-ctrl_index]

meta_data_ctrl <- meta_data[-LPS_index,]
meta_data_LPS <- meta_data[-ctrl_index,]

se_cvid_ctrl <- SummarizedExperiment(assays=list(counts=counts_ctrl),
                     colData = meta_data_ctrl)
se_cvid_LPS <- SummarizedExperiment(assays=list(counts=counts_LPS),
                     colData = meta_data_LPS)

# create counts per million, log counts and log counts per million features
# and save them into new assays
se_cvid <- mkAssay(se_cvid, log = TRUE, counts_to_CPM = TRUE)
# same for control only
se_cvid_ctrl <- mkAssay(se_cvid_ctrl, log = TRUE, counts_to_CPM = TRUE)
# same for LPS treatment group only
se_cvid_LPS <- mkAssay(se_cvid_LPS, log = TRUE, counts_to_CPM = TRUE)


# display all assays, control and LPS treatment
assays(se_cvid)
## List of length 4
## names(4): counts log_counts counts_cpm log_counts_cpm
# display all assays, control only
assays(se_cvid_ctrl)
## List of length 4
## names(4): counts log_counts counts_cpm log_counts_cpm
# display all assays, LPS treatment only
assays(se_cvid_LPS)
## List of length 4
## names(4): counts log_counts counts_cpm log_counts_cpm

Visualization and Dimension reduction

PCA

Entire sample - Healthy Control & LPS Treatment
# fit PCA model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
pca_out <- prcomp(t(assay(se_cvid,"log_counts_cpm")))

# define dataframe with PCA components and immune deficiency disease (IDC)
pca_plot <- as.data.frame(pca_out$x) # X, Y, ...
pca_plot$disease <- as.factor(se_cvid$disease) # color/group

# plot PC1 and PC2 by IDC
g <- pca_plot %>% ggplot(aes(x=PC1, y=PC2, color=disease)) +
  geom_point(size=1.5) + xlab("PC 1") + ylab("PC 2") +
  geom_text(label=colnames(se_cvid), nudge_y = 5,) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("PCA Plot - Entire Sample")

plot(g)

UMAP

Entire sample - Healthy Control & LPS Treatment
# fit UMAP model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
umap.defaults$n_neighbors=5
umap_out <- umap(t(assay(se_cvid,"log_counts_cpm")), config=umap.defaults)

# define dataframe with UMAP dimensions and IDC
umap_plot <- as.data.frame(umap_out$layout) # X, Y, ...
umap_plot$disease <- as.factor(se_cvid$disease) # color / group

# plot UMAP1 and UMAP2 by IDC
g <- umap_plot %>% ggplot(aes(x=V1, y=V2, color=disease)) +
  geom_point(size=1.5) + xlab("UMAP1") + ylab("UMAP2") +
  geom_text(label=colnames(se_cvid), nudge_y = 0.1,) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("UMAP Plot - Entire Sample")

plot(g)

PCA (Coloring Treatment Groups)

# fit PCA model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
pca_out <- prcomp(t(assay(se_cvid,"log_counts_cpm")))

# define dataframe with PCA components and immune deficiency disease (IDC)
pca_plot <- as.data.frame(pca_out$x) # X, Y, ...
pca_plot$treatment <- as.factor(se_cvid$treatment) # color/group

# plot PC1 and PC2 by IDC
g <- pca_plot %>% ggplot(aes(x=PC1, y=PC2, color=treatment)) +
  geom_point(size=1.5) + xlab("PC 1") + ylab("PC 2") +
  geom_text(label=colnames(se_cvid), nudge_y = 5,) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("PCA Plot - Entire Sample with Treatment Groups")

plot(g)

UMAP (Coloring Treatment Groups)

# fit UMAP model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
umap.defaults$n_neighbors=5
umap_out <- umap(t(assay(se_cvid,"log_counts_cpm")), config=umap.defaults)

# define dataframe with UMAP dimensions and IDC
umap_plot <- as.data.frame(umap_out$layout) # X, Y, ...
umap_plot$treatment <- as.factor(se_cvid$treatment) # color / group

# plot UMAP1 and UMAP2 by IDC
g <- umap_plot %>% ggplot(aes(x=V1, y=V2, color=treatment)) +
  geom_point(size=1.5) + xlab("UMAP1") + ylab("UMAP2") +
  geom_text(label=colnames(se_cvid), nudge_y = 0.1,) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("UMAP Plot - Entire Sample with Treatment Groups")

plot(g)

As we can observe, the treatment variable overwhelms the dimension reduction, indicating we should run separate analysis for each treatment group.

Group Analyses

LPS Treatment Group

Visualization and Dimension reduction

PCA LPS Group

# fit PCA model to log cpm data for LPS treatment group
# note: transpose data to make it tidy
set.seed(1)
pca_out <- prcomp(t(assay(se_cvid_LPS,"log_counts_cpm")))

# define dataframe with PCA components and immune deficiency disease (IDC)
pca_plot <- as.data.frame(pca_out$x) # X, Y, ...
pca_plot$disease <- as.factor(se_cvid_LPS$disease) # color/group

# plot PC1 and PC2 by IDC
g <- pca_plot %>% ggplot(aes(x=PC1, y=PC2, color=disease)) +
  geom_point(size=1.5) + xlab("PC 1") + ylab("PC 2") +
  geom_text(label=colnames(se_cvid_LPS), nudge_y = 5,) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("PCA Plot - LPS group")

plot(g)

UMAP LPS Group

# fit UMAP model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
umap.defaults$n_neighbors=5
umap_out <- umap(t(assay(se_cvid_LPS,"log_counts_cpm")), config=umap.defaults)

# define dataframe with UMAP dimensions and IDC
umap_plot <- as.data.frame(umap_out$layout) # X, Y, ...
umap_plot$disease <- as.factor(se_cvid_LPS$disease) # color / group

# plot UMAP1 and UMAP2 by IDC
g <- umap_plot %>% ggplot(aes(x=V1, y=V2, color=disease)) +
  geom_point(size=1.5) + xlab("UMAP1") + ylab("UMAP2") +
  geom_text(label=colnames(se_cvid_LPS), nudge_y = 0.1,) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("UMAP Plot - LPS group")

plot(g)

Differential Expression

DESeq2 Analysis

# format data into a DESeq2-friendly Data Structure
dds <- DESeqDataSetFromMatrix(countData=counts, colData=meta_data, design=~disease)
#colData is a data frame of demographic/phenotypic data

# filter for genes with significant expression  
# expression should be > 0, but > 100 should be better
dds<-dds[rowSums(counts(dds))>1,] #Gene Filtering

# fit negative binomial regression
dds<-DESeq(dds) #Performs estimation of size factors, dispersion, and negative binomial GLM f#itting

# extracting results, ordering them by adjusted p-value
res <- results(dds)[order(results(dds)[,6]),]
#res[1:10,]

# dislpay results for the top 1000 most significant genes
datatable(data.frame(res[1:1000,]))
# store the name of the top 250 most significant genes
deseq250 <- rownames(res)[1:250]
LPS.deseq.res <- data.frame(res)

Heatmap of DEGs (DESeq2)

# Make a Heatmap of DEGs

# All together: extract the log cpm of the top 250 genes and store that in a matrix
# # get the name of the genes ordered by adjusted p-val
top_genes = rownames(results(dds)[order(results(dds)$padj),])[1:250]
# # convert log cpm to a matrix structure sorted by the top genes
mat = as.matrix(assay(se_cvid_LPS,"log_counts_cpm"))[top_genes,] 

# normalize data row-wise
mat = t(scale(t(mat)))

# store IDC into a single-column dataframe
df=data.frame(disease=colData(se_cvid_LPS)$disease) 

# create HeatMap with some annotations
ha_colors = list(disease=c("healthy"="Blue","Deficient"="Red"))

ha = HeatmapAnnotation(df = df, col = ha_colors)
Heatmap(mat,show_row_names=F,show_column_names = F, top_annotation=ha)

Limma Analysis

# collapse data into healthy or not

# Store data in Limma-friendly Data Structure
dge <- DGEList(counts=counts, group=group) #From edgeR, Computes library size

# filter for genes with significant expression
counts<-counts[which(rowSums(cpm(counts))>1),] #Gene Filtering #(mask)
dge <- DGEList(counts=counts, group=group) #Re-compute library size #(filter)

# Trimmed-Mean Normalization of data
dge <- calcNormFactors(dge) #TMM normalization

# Create design matrix (intercept + indicator variable)

design<-model.matrix(~group)

# distribution normalizing transformation
v<-voom(dge, design) #voom transform to calculate weights to eliminate mean-variance #relationship
#use usual limma pipelines

# fit empirical bayes regression model
fit<-lmFit(v,design)
fit<-eBayes(fit)

# organize data by top ranked genes when predicting IDC (coef = col#2 = UCVID)
# topTable(fit, coef=2)

# display top 1000 most significant genes
datatable(topTable(fit, coef=2, number=1000))
# store the name of the top 250 most significant genes
limma250 <- rownames(topTable(fit, coef=2, number=250))
LPS.limma.res <- data.frame(topTable(fit, coef=2))

Heatmap of DEGs

# Make a Heatmap of DEGs

# extract the log cpm of the top 250 genes and store that in a matrix
# get the name of the top 250 most significant genes
top_genes = rownames(topTable(fit, coef=2, number=250))
# convert log cpm to a matrix structure
mat = as.matrix(assay(se_cvid_LPS,"log_counts_cpm"))[top_genes,] 

# normalize data row-wise
mat = t(scale(t(mat)))

# store IDC into a single-column dataframe
df=data.frame(disease=colData(se_cvid_LPS)$disease) 

# create HeatMap with some annotations
ha = HeatmapAnnotation(df = df, col=ha_colors)
Heatmap(mat,show_row_names=F,show_column_names=F, top_annotation=ha)

Pathway analysis

EnrichR analysis

DESeq2 Genes
# based on the results of DEseq, cross-reference the gene with pathway databases to infer which pathways are the most significant

enriched <- enrichr(deseq250, dbs)
## Uploading data to Enrichr... Done.
##   Querying WikiPathways_2016... Done.
##   Querying KEGG_2016... Done.
##   Querying Panther_2016... Done.
##   Querying Reactome_2016... Done.
## Parsing results... Done.
DESeq2 WikiPathways
datatable(enriched$WikiPathways_2016)
DESeq2 KEGG
datatable(enriched$KEGG_2016)
DESeq2 Panther
datatable(enriched$Panther_2016)
DESeq2 Reactome
datatable(enriched$Reactome_2016)
Limma Genes (CVID)
enriched <- enrichr(limma250, dbs)
## Uploading data to Enrichr... Done.
##   Querying WikiPathways_2016... Done.
##   Querying KEGG_2016... Done.
##   Querying Panther_2016... Done.
##   Querying Reactome_2016... Done.
## Parsing results... Done.
Limma WikiPathways
datatable(enriched$WikiPathways_2016)
Limma KEGG
datatable(enriched$KEGG_2016)
Limma Panther
datatable(enriched$Panther_2016)
Limma Reactome
datatable(enriched$Reactome_2016)

Healthy Control Group

Visualization and Dimension reduction

PCA Healthy Group

# fit PCA model to log cpm data for healthy control group
# note: transpose data to make it tidy
set.seed(1)
pca_out <- prcomp(t(assay(se_cvid_ctrl,"log_counts_cpm")))

# define dataframe with PCA components and immune deficiency disease (IDC)
pca_plot <- as.data.frame(pca_out$x) # X, Y, ...
pca_plot$disease <- as.factor(se_cvid_ctrl$disease) # color/group

# plot PC1 and PC2 by IDC
g <- pca_plot %>% ggplot(aes(x=PC1, y=PC2, color=disease)) +
  geom_point(size=1.5) + xlab("PC 1") + ylab("PC 2") +
  geom_text(label=colnames(se_cvid_ctrl), nudge_y = 5,) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("PCA Plot - Control group")

plot(g)

UMAP Healthy Group

# fit UMAP model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
umap.defaults$n_neighbors=5
umap_out <- umap(t(assay(se_cvid_ctrl,"log_counts_cpm")), config=umap.defaults)

# define dataframe with UMAP dimensions and IDC
umap_plot <- as.data.frame(umap_out$layout) # X, Y, ...
umap_plot$disease <- as.factor(se_cvid_ctrl$disease) # color / group

# plot UMAP1 and UMAP2 by IDC
g <- umap_plot %>% ggplot(aes(x=V1, y=V2, color=disease)) +
  geom_point(size=1.5) + xlab("UMAP1") + ylab("UMAP2") +
  geom_text(label=colnames(se_cvid_ctrl), nudge_y = 0.1,) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("UMAP Plot - Control group")

plot(g)

Differential Expression

DESeq2 Analysis

# format data into a DESeq2-friendly Data Structure
dds <- DESeqDataSetFromMatrix(countData=counts, colData=meta_data, design=~disease)
#colData is a data frame of demographic/phenotypic data

# filter for genes with significant expression  
# expression should be > 0, but > 100 should be better
dds<-dds[rowSums(counts(dds))>1,] #Gene Filtering

# fit negative binomial regression
dds<-DESeq(dds) #Performs estimation of size factors, dispersion, and negative binomial GLM f#itting

# extracting results, ordering them by adjusted p-value
res <- results(dds)[order(results(dds)[,6]),]
#res[1:10,]

# display results for the top 1000 most significant genes
datatable(data.frame(res[1:1000,]))
# store the name of the top 250 most significant genes
deseq250 <- rownames(res)[1:250]
PBMC.deseq.res <- data.frame(res)

Heatmap of DEGs (DESeq2)

# Make a Heatmap of DEGs

# All together: extract the log cpm of the top 250 genes and store that in a matrix
# # get the name of the genes ordered by adjusted p-val
top_genes = rownames(results(dds)[order(results(dds)$padj),])[1:250]
# # convert log cpm to a matrix structure sorted by the top genes
mat = as.matrix(assay(se_cvid_ctrl,"log_counts_cpm"))[top_genes,] 

# normalize data row-wise
mat = t(scale(t(mat)))

# store IDC into a single-column dataframe
df=data.frame(disease=colData(se_cvid_ctrl)$disease) 

# create HeatMap with some annotations
ha_colors = list(disease=c("healthy"="Blue","Deficient"="Red"))

ha = HeatmapAnnotation(df = df, col = ha_colors)
Heatmap(mat,show_row_names=F,show_column_names = F, top_annotation=ha)

Limma Analysis

# collapse data into healthy or not

# Store data in Limma-friendly Data Structure
dge <- DGEList(counts=counts, group=group) #From edgeR, Computes library size

# filter for genes with significant expression
counts<-counts[which(rowSums(cpm(counts))>1),] #Gene Filtering #(mask)
dge <- DGEList(counts=counts, group=group) #Re-compute library size #(filter)

# Trimmed-Mean Normalization of data
dge <- calcNormFactors(dge) #TMM normalization

# Create design matrix (intercept + indicator variable)

design<-model.matrix(~group)

# distribution normalizing transformation
v<-voom(dge, design) #voom transform to calculate weights to eliminate mean-variance #relationship
#use usual limma pipelines

# fit empirical bayes regression model
fit<-lmFit(v,design)
fit<-eBayes(fit)

# organize data by top ranked genes when predicting IDC (coef = col#2 = UCVID)
# topTable(fit, coef=2)

# display top 1000 most significant genes
datatable(topTable(fit, coef=2, number=1000))
# store the name of the top 250 most significant genes
limma250 <- rownames(topTable(fit, coef=2, number=250))
PBMC.limma.res <- data.frame(topTable(fit, coef=2))

Heatmap of DEGs

# Make a Heatmap of DEGs

# extract the log cpm of the top 250 genes and store that in a matrix
# get the name of the top 250 most significant genes
top_genes = rownames(topTable(fit, coef=2, number=250))
# convert log cpm to a matrix structure
mat = as.matrix(assay(se_cvid_ctrl,"log_counts_cpm"))[top_genes,] 

# normalize data row-wise
mat = t(scale(t(mat)))

# store IDC into a single-column dataframe
df=data.frame(disease=colData(se_cvid_ctrl)$disease) 

# create HeatMap with some annotations
ha = HeatmapAnnotation(df = df, col=ha_colors)
Heatmap(mat,show_row_names=F,show_column_names=F, top_annotation=ha)

Pathway analysis

EnrichR analysis

DESeq2 Genes
# based on the results of DEseq, cross-reference the gene with pathway databases to infer which pathways are the most significant

enriched <- enrichr(deseq250, dbs)
## Uploading data to Enrichr... Done.
##   Querying WikiPathways_2016... Done.
##   Querying KEGG_2016... Done.
##   Querying Panther_2016... Done.
##   Querying Reactome_2016... Done.
## Parsing results... Done.
DESeq2 WikiPathways
datatable(enriched$WikiPathways_2016)
DESeq2 KEGG
datatable(enriched$KEGG_2016)
DESeq2 Panther
datatable(enriched$Panther_2016)
DESeq2 Reactome
datatable(enriched$Reactome_2016)
Limma Genes (CVID)
enriched <- enrichr(limma250, dbs)
## Uploading data to Enrichr... Done.
##   Querying WikiPathways_2016... Done.
##   Querying KEGG_2016... Done.
##   Querying Panther_2016... Done.
##   Querying Reactome_2016... Done.
## Parsing results... Done.
Limma WikiPathways
datatable(enriched$WikiPathways_2016)
Limma KEGG
datatable(enriched$KEGG_2016)
Limma Panther
datatable(enriched$Panther_2016)
Limma Reactome
datatable(enriched$Reactome_2016)

Creating CSV for Comparative Analysis

#combine result lists into dataframe

top.250.results.df <- data.frame(ctrl_deseq=ctrl_deseq,
                           ctrl_limma=ctrl_limma,
                           LPS_deseq=LPS_deseq,
                           LPS_limma=LPS_limma) 

# obtain project directory
project_root <- rprojroot::find_root(has_dir("Meta-omic-Analysis-Capstone"))
# obtain comparison analysis directory

relative_path <- file.path("Meta-omic-Analysis-Capstone/Deliverable 6")

# file output paths
output.path.1 <- fs::path(project_root,
                         relative_path, "data",
                         "PBMC_CVID_top250_results.csv")
output.path.2 <- fs::path(project_root,
                         relative_path, "data",
                         "PBMC_CVID_all_results.csv")

# write .csv file into Deliverable 6 folder
write.csv(x=top.250.results.df, file=output.path.1, row.names = T)
write.csv(x=PBMC.deseq.res, file=output.path.2, row.names = T)